# load packages
suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
  library(zoo)
  library(xts)
  library(tseries)
  library(forecast)
  library(TSA)
  library(reshape2)
  library(naniar)
  library(lubridate)
  library(Metrics)
})

Import and process data

# import data
raw.df <- read.csv("city_temperature.csv")

# remove observations with year = 200, 201 or 2020 (only partial data), Day = 0 
wip.df <- raw.df %>% 
  filter(!Year %in% (200:201)) %>%
  filter(Day != 0)

# date data type
wip.df <- wip.df %>%
  mutate(Date = as.Date(with(wip.df,paste(Year,Month,Day,sep="-")),
                        "%Y-%m-%d")) %>%
  dplyr::select(-c("Month","Day","Year"))

# number of days between the start and end date (inclusive)
n_days <- max(wip.df$Date) - min(wip.df$Date) + 1

Subset US cities

# list of US cities of interest
CitiesOfInterest <- c("Chicago",
                      "Honolulu",
                      "San Diego",
                      "Kansas City",
                      "Las Vegas",
                      "Los Angeles",
                      "Miami Beach",
                      "Minneapolis St. Paul",
                      "New York City",
                      "Phoenix",
                      "Raleigh Durham",
                      "San Antonio")

# subset US cities of interest 
df.usa <- wip.df %>% dplyr::filter((Country=="US") 
                                     & (City %in% CitiesOfInterest) 
                                     & (State!="Additional Territories")) %>%
  dplyr::select(-c("Region", "Country", "State")) # drop redundant columns

# reshape data set, one column per city
df_melt <- melt(df.usa, id.vars = c("Date","City"))
df.usa2 <- dcast(df_melt, Date ~ City)

# update values where `AvgTemperature` = -99 by forward filling 
df.usa2 <- df.usa2 %>% 
  naniar::replace_with_na_all(condition = ~.x == -99) %>%
  na.locf(fromLast = FALSE)

Univariate Time Series: Daily, Weekly, Monthly Avg Temp in Chicago

Create Test and Train Splits

  • Train : 1995-01-01 - 2019-12-31
  • Test : 2020-01-01 - 2020-05-13
# select Chicago
df <- df.usa2 %>% dplyr::select(c("Date","Chicago")) %>% rename(AvgTemp = "Chicago")
# ts of daily avg temp
ts.d <- df %>% dplyr::select(-c("Date")) %>% ts(start=c(1995,1), frequency=365)
# split test and train
train.d <- window(ts.d, end=c(2019,365))  ;  autoplot(train.d)

test.d <- window(ts.d, start=c(2020,1))   ;  autoplot(test.d)

# check length
length(ts.d) == n_days
## [1] TRUE
length(ts.d) == length(train.d) + length(test.d)
## [1] TRUE
# weekly avg temp
ts.w <- df %>%
  # group by week, use week start date
  group_by(WeekStart = lubridate::floor_date(df$Date,unit="week")) %>%
  # calculate avg
  summarize(AvgTemp = mean(AvgTemp) %>% round(2)) %>% 
  # drop "WeekStart" date before converting to ts
  dplyr::select(-c("WeekStart")) %>%
  # convert to time series object
  ts(start=c(1995,1), frequency=52)
# split test and train
train.w <- window(ts.w, start=c(1995,1), end=c(2019,52))  ;  autoplot(train.w)

test.w <- window(ts.w, start=c(2020,1))  ;  autoplot(test.w)

# check length
length(ts.w) == length(train.w) + length(test.w)
## [1] TRUE
# monthly avg temp
ts.m <- df %>%
  # group by YearMonth
  group_by(YearMonth = as.yearmon(Date)) %>%
  # calculate avg
  summarize(AvgTemp = mean(AvgTemp) %>% round(2)) %>% 
  # drop "YearMonth" date before converting to ts
  dplyr::select(-c("YearMonth")) %>%
  # convert to time series object
  ts(start=c(1995,1), frequency=12)
# split test and train
train.m <- window(ts.m, end=c(2019,12))  ;  autoplot(train.m)

test.m <- window(ts.m, start=c(2020,1))  ;  autoplot(test.m)

# check length
length(ts.m) == length(train.m) + length(test.m)
## [1] TRUE

#STL Decomposition

STL_D = stl(train.d[,1],s.window = "periodic")

plot(STL_D, main="Daily Decomposition")

STL_W = stl(train.w[,1],s.window = "periodic")

plot(STL_W, main="Weekly Decomposition")

STL_M = stl(train.m[,1],s.window = "periodic")

plot(STL_M, main="Monthly Decomposition")

Model

#Set Frequency for TS for STL

ts.sd.w = ts(ts.w[1:1324,1],frequency = 52)
# split test and train
train.w.sd  = ts.sd.w[1:1300]
tstrain.sd.w = ts(train.w.sd[1:1300],frequency = 52)
test.w.sd = ts.sd.w[1301:1324]
tstest.sd.w = ts(test.w.sd[1:24],frequency = 52)
# check length
length(ts.sd.w) == length(train.w.sd) + length(test.w.sd)
## [1] TRUE

MODELING

#STL on dataset

STL_SD_W = stl(ts.sd.w,s.window="periodic")

fcst_sd_w = forecast(STL_SD_W,h=length(test.w.sd))

plot(fcst_sd_w)

checkresiduals(fcst_sd_w)
## Warning in checkresiduals(fcst_sd_w): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 233.52, df = 102, p-value = 2.577e-12
## 
## Model df: 2.   Total lags used: 104
Box.test(fcst_sd_w,type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fcst_sd_w
## X-squared = 10.435, df = 1, p-value = 0.001237
ts.plot(residuals(fcst_sd_w),ylab="Residuals",xlab="",main="Residuals from STL Model")

acf(residuals(fcst_sd_w),main="ACF of STL Residuals")

#STL on train dataset 

STL_SD_W_train = stl(train.w[,1],s.window = "periodic")

plot(STL_SD_W_train, main="STL Train Decomposition")

fcst_sd_w_train = forecast(STL_SD_W_train,h=length(test.w))

plot(fcst_sd_w_train,main="Forecasts from STL and ARIMA")

#Check Accuracy

acc_train = forecast::accuracy(fcst_sd_w_train,test.w)

acc_train
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  0.01646531 6.013044 4.588315  -5.09680 14.93285 0.6917317
## Test set     -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
##                   ACF1 Theil's U
## Training set 0.1392196        NA
## Test set     0.5301805  1.183986
#SMAPE
smape_train = smape(train.w,fcst_sd_w_train$residuals)
smape_test = smape(test.w,fcst_sd_w_train$mean)

#Residuals
checkresiduals(fcst_sd_w_train)
## Warning in checkresiduals(fcst_sd_w_train): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 236.6, df = 102, p-value = 1.054e-12
## 
## Model df: 2.   Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train,type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fcst_sd_w_train
## X-squared = 10.49, df = 1, p-value = 0.0012
ts.plot(residuals(fcst_sd_w_train),ylab="Residuals",xlab="",main="Residuals from STL Train Model")

acf(residuals(fcst_sd_w_train),main="ACF of STL Train Residuals")

#STL on train dataset V1

STL_SD_W_train_v1 = stl(train.w[,1],s.window = 10,t.window=13)


plot(STL_SD_W_train_v1, main="STL Train Decomposition")

fcst_sd_w_train_v1 = forecast(STL_SD_W_train_v1,h=length(test.w))

plot(fcst_sd_w_train_v1,main="Forecasts from STL and ARIMA")

#Check Accuracy

acc_train_v1 = forecast::accuracy(fcst_sd_w_train_v1,test.w)

acc_train
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  0.01646531 6.013044 4.588315  -5.09680 14.93285 0.6917317
## Test set     -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
##                   ACF1 Theil's U
## Training set 0.1392196        NA
## Test set     0.5301805  1.183986
#SMAPE
smape_train_v1 = smape(train.w,fcst_sd_w_train_v1$residuals)
smape_test_v1 = smape(test.w,fcst_sd_w_train_v1$mean)

#Residuals
checkresiduals(fcst_sd_w_train_v1)
## Warning in checkresiduals(fcst_sd_w_train_v1): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 388.83, df = 102, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train_v1,type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fcst_sd_w_train_v1
## X-squared = 9.893, df = 1, p-value = 0.001659
ts.plot(residuals(fcst_sd_w_train_v1),ylab="Residuals",xlab="",main="Residuals from STL Train Model")

acf(residuals(fcst_sd_w_train_v1),main="ACF of STL Train Residuals")

#MSTL on train dataset

STL_SD_W_train_v2 = mstl(train.w[,1],s.window = 13)


plot(STL_SD_W_train_v2, main="STL Train Decomposition")

fcst_sd_w_train_v2 = forecast(STL_SD_W_train_v2,h=length(test.w))

plot(fcst_sd_w_train_v2,main="Forecasts from STL and ARIMA")

#Check Accuracy

acc_train_v2= forecast::accuracy(fcst_sd_w_train_v2,test.w)

acc_train
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  0.01646531 6.013044 4.588315  -5.09680 14.93285 0.6917317
## Test set     -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
##                   ACF1 Theil's U
## Training set 0.1392196        NA
## Test set     0.5301805  1.183986
#SMAPE
smape_train_v2 = smape(train.w,fcst_sd_w_train_v2$residuals)
smape_test_v2 = smape(test.w,fcst_sd_w_train_v2$mean)

#Residuals
checkresiduals(fcst_sd_w_train_v2)
## Warning in checkresiduals(fcst_sd_w_train_v2): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 346.19, df = 102, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train_v2,type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fcst_sd_w_train_v2
## X-squared = 9.221, df = 1, p-value = 0.002393
ts.plot(residuals(fcst_sd_w_train_v2),ylab="Residuals",xlab="",main="Residuals from STL Train Model")

acf(residuals(fcst_sd_w_train_v2),main="ACF of STL Train Residuals")